O dataset

Disponível em: http://insideairbnb.com/get-the-data.html. Referência para seguir: https://www.kaggle.com/josipdomazet/mining-nyc-airbnb-data-using-r

Download:

Importa o dataset

## -- Attaching packages ------------------------------------------------------------------------------------------------------------------------------------------------ tidyverse 1.2.1 --
## v ggplot2 3.2.1     v purrr   0.3.2
## v tibble  2.1.3     v dplyr   0.8.3
## v tidyr   1.0.0     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0
## -- Conflicts --------------------------------------------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   id = col_double(),
##   scrape_id = col_double(),
##   last_scraped = col_date(format = ""),
##   thumbnail_url = col_logical(),
##   medium_url = col_logical(),
##   xl_picture_url = col_logical(),
##   host_id = col_double(),
##   host_since = col_date(format = ""),
##   host_is_superhost = col_logical(),
##   host_listings_count = col_double(),
##   host_total_listings_count = col_double(),
##   host_has_profile_pic = col_logical(),
##   host_identity_verified = col_logical(),
##   neighbourhood_group_cleansed = col_logical(),
##   latitude = col_double(),
##   longitude = col_double(),
##   is_location_exact = col_logical(),
##   accommodates = col_double(),
##   bathrooms = col_double(),
##   bedrooms = col_double()
##   # ... with 40 more columns
## )
## See spec(...) for full column specifications.
## Warning: 3 parsing failures.
##   row     col           expected             actual         file
##  1745 license 1/0/T/F/TRUE/FALSE +1512-6670366      <connection>
## 28567 license 1/0/T/F/TRUE/FALSE 56131/AL           <connection>
## 34253 license 1/0/T/F/TRUE/FALSE 05.557.336/0001-70 <connection>
## # A tibble: 35,451 x 23
##    name  host_name host_total_list~ calculated_host~ neighbourhood_c~
##    <chr> <fct>                <dbl>            <dbl> <fct>           
##  1 Very~ Matthias                 2                1 Copacabana      
##  2 Beau~ Viviane                  3                3 Copacabana      
##  3 NICE~ Renata                   1                1 Ipanema         
##  4 Cosy~ Patricia                 1                1 Ipanema         
##  5 COPA~ Patricia~                1                1 Copacabana      
##  6 Copa~ Seba                     1                1 Copacabana      
##  7 Beac~ Alex                     7                6 Ipanema         
##  8 Rio ~ Vana                     2                1 Copacabana      
##  9 4bed~ Marcio                   7                5 Copacabana      
## 10 HUma~ Marcio                   7                5 Humaitá         
## # ... with 35,441 more rows, and 18 more variables: latitude <dbl>,
## #   longitude <dbl>, property_type <fct>, room_type <fct>, price <dbl>,
## #   accommodates <dbl>, bedrooms <dbl>, minimum_nights <dbl>,
## #   maximum_nights <dbl>, availability_365 <dbl>, number_of_reviews <dbl>,
## #   review_scores_rating <dbl>, review_scores_accuracy <dbl>,
## #   review_scores_location <dbl>, review_scores_value <dbl>,
## #   cancellation_policy <fct>, require_guest_profile_picture <lgl>,
## #   require_guest_phone_verification <lgl>

Descrição dos dados

# Per-column summary statistics of the listings table.
summary(airbnb)
##      name             host_name     host_total_listings_count
##  Length:35451       Daniel :  432   Min.   :  0.000          
##  Class :character   Ricardo:  322   1st Qu.:  1.000          
##  Mode  :character   Maria  :  315   Median :  1.000          
##                     Marcelo:  311   Mean   :  9.691          
##                     Mario  :  309   3rd Qu.:  3.000          
##                     (Other):33702   Max.   :776.000          
##                     NA's   :   60   NA's   :60               
##  calculated_host_listings_count              neighbourhood_cleansed
##  Min.   :  1.000                Copacabana              : 8825     
##  1st Qu.:  1.000                Barra da Tijuca         : 3908     
##  Median :  1.000                Ipanema                 : 2970     
##  Mean   :  7.885                Jacarepaguá             : 1917     
##  3rd Qu.:  2.000                Botafogo                : 1767     
##  Max.   :265.000                Recreio dos Bandeirantes: 1750     
##                                 (Other)                 :14314     
##     latitude        longitude                 property_type  
##  Min.   :-23.07   Min.   :-43.74   Apartment         :27023  
##  1st Qu.:-22.98   1st Qu.:-43.32   House             : 3709  
##  Median :-22.97   Median :-43.20   Condominium       : 1618  
##  Mean   :-22.96   Mean   :-43.25   Serviced apartment:  903  
##  3rd Qu.:-22.94   3rd Qu.:-43.19   Loft              :  653  
##  Max.   :-22.75   Max.   :-43.10   Bed and breakfast :  281  
##                                    (Other)           : 1264  
##            room_type         price          accommodates    
##  Entire home/apt:25006   Min.   :    0.0   Min.   :  1.000  
##  Private room   : 9586   1st Qu.:  150.0   1st Qu.:  2.000  
##  Shared room    :  859   Median :  281.0   Median :  4.000  
##                          Mean   :  622.2   Mean   :  4.175  
##                          3rd Qu.:  599.0   3rd Qu.:  5.000  
##                          Max.   :40000.0   Max.   :160.000  
##                                                             
##     bedrooms      minimum_nights     maximum_nights     availability_365
##  Min.   : 0.000   Min.   :   1.000   Min.   :       1   Min.   :  0.0   
##  1st Qu.: 1.000   1st Qu.:   1.000   1st Qu.:      30   1st Qu.:  0.0   
##  Median : 1.000   Median :   2.000   Median :    1125   Median :179.0   
##  Mean   : 1.637   Mean   :   4.736   Mean   :    1527   Mean   :190.1   
##  3rd Qu.: 2.000   3rd Qu.:   4.000   3rd Qu.:    1125   3rd Qu.:362.0   
##  Max.   :22.000   Max.   :1123.000   Max.   :10000000   Max.   :365.0   
##  NA's   :23                                                             
##  number_of_reviews review_scores_rating review_scores_accuracy
##  Min.   :  0.000   Min.   : 20.00       Min.   : 2.000        
##  1st Qu.:  0.000   1st Qu.: 93.00       1st Qu.:10.000        
##  Median :  1.000   Median : 98.00       Median :10.000        
##  Mean   :  7.952   Mean   : 94.35       Mean   : 9.617        
##  3rd Qu.:  5.000   3rd Qu.:100.00       3rd Qu.:10.000        
##  Max.   :350.000   Max.   :100.00       Max.   :10.000        
##                    NA's   :17455        NA's   :17473         
##  review_scores_location review_scores_value
##  Min.   : 2.000         Min.   : 2.000     
##  1st Qu.:10.000         1st Qu.: 9.000     
##  Median :10.000         Median :10.000     
##  Mean   : 9.723         Mean   : 9.312     
##  3rd Qu.:10.000         3rd Qu.:10.000     
##  Max.   :10.000         Max.   :10.000     
##  NA's   :17471          NA's   :17472      
##                   cancellation_policy require_guest_profile_picture
##  flexible                   :15552    Mode :logical                
##  moderate                   : 5757    FALSE:34873                  
##  strict                     :    2    TRUE :578                    
##  strict_14_with_grace_period:13589                                 
##  super_strict_30            :  163                                 
##  super_strict_60            :  388                                 
##                                                                    
##  require_guest_phone_verification
##  Mode :logical                   
##  FALSE:34850                     
##  TRUE :601                       
##                                  
##                                  
##                                  
## 
# Compact column-by-column overview: name, type and first values.
airbnb %>% glimpse()
## Observations: 35,451
## Variables: 23
## $ name                             <chr> "Very Nice 2Br - Copacabana -...
## $ host_name                        <fct> Matthias, Viviane, Renata, Pa...
## $ host_total_listings_count        <dbl> 2, 3, 1, 1, 1, 1, 7, 2, 7, 7,...
## $ calculated_host_listings_count   <dbl> 1, 3, 1, 1, 1, 1, 6, 1, 5, 5,...
## $ neighbourhood_cleansed           <fct> Copacabana, Copacabana, Ipane...
## $ latitude                         <dbl> -22.96592, -22.97712, -22.983...
## $ longitude                        <dbl> -43.17896, -43.19045, -43.214...
## $ property_type                    <fct> Condominium, Apartment, Apart...
## $ room_type                        <fct> Entire home/apt, Entire home/...
## $ price                            <dbl> 296, 161, 243, 337, 221, 150,...
## $ accommodates                     <dbl> 5, 3, 3, 3, 2, 2, 13, 1, 11, ...
## $ bedrooms                         <dbl> 2, 1, 1, 1, 1, 1, 6, 1, 4, 1,...
## $ minimum_nights                   <dbl> 4, 4, 2, 2, 3, 2, 2, 3, 4, 5,...
## $ maximum_nights                   <dbl> 30, 30, 1125, 89, 28, 30, 89,...
## $ availability_365                 <dbl> 332, 352, 125, 122, 145, 89, ...
## $ number_of_reviews                <dbl> 233, 232, 260, 160, 303, 1, 5...
## $ review_scores_rating             <dbl> 93, 94, 96, 94, 98, NA, 91, 9...
## $ review_scores_accuracy           <dbl> 9, 9, 10, 10, 10, NA, 9, 10, ...
## $ review_scores_location           <dbl> 10, 10, 10, 10, 10, NA, 10, 1...
## $ review_scores_value              <dbl> 9, 9, 10, 9, 10, NA, 9, 10, 6...
## $ cancellation_policy              <fct> strict_14_with_grace_period, ...
## $ require_guest_profile_picture    <lgl> FALSE, TRUE, FALSE, TRUE, FAL...
## $ require_guest_phone_verification <lgl> FALSE, TRUE, FALSE, TRUE, TRU...

Missing Data

# Keep only listings usable for the analysis: at least one review, a non-zero
# price, and no missing value in any review score or in the bedroom count.
airbnb <- airbnb %>%
  filter(number_of_reviews != 0, price != 0) %>%
  drop_na(
    review_scores_accuracy,
    review_scores_value,
    review_scores_rating,
    review_scores_location,
    bedrooms
  )

# Count the remaining missing values per column (one row per column) and show
# only the columns that still contain NAs. The review-score NAs were already
# dropped above, so no extra drop_na() is needed here.
missing_airbnb <- airbnb %>%
  summarise_all(~ sum(is.na(.))) %>%
  pivot_longer(everything(), names_to = "variables", values_to = "missing")
missing_airbnb %>% filter(missing > 0)
## # A tibble: 3 x 2
##   variables                 missing
##   <chr>                       <int>
## 1 name                            2
## 2 host_name                      40
## 3 host_total_listings_count      40

Visualização

Bairros

# Number of most-frequent neighbourhoods shown individually; all remaining
# neighbourhoods are pooled under the label "Outros".
n_bairros <- 7

# Count listings per neighbourhood (most frequent first), relabel every row
# past the top `n_bairros` as "Outros", then sum the counts per label.
# Ranking by row position also works when there are fewer than `n_bairros`
# neighbourhoods, where indexing `1:n_bairros` and repeating "Outros" a
# negative number of times would fail.
bairros <- airbnb %>%
  count(neighbourhood_cleansed, sort = TRUE) %>%
  mutate(
    bairro = if_else(
      row_number() <= n_bairros,
      as.character(neighbourhood_cleansed),
      "Outros"
    ),
    # Levels follow first appearance: top neighbourhoods by count, then "Outros".
    bairro = factor(bairro, levels = unique(bairro))
  ) %>%
  group_by(bairro) %>%
  summarise(n = sum(n))

# Bar chart of the pooled counts with the value printed above each bar; the
# legend is hidden because the x axis already names every bar.
bairros %>%
  ggplot(aes(bairro, n, fill = bairro)) +
  geom_col() +
  geom_text(aes(label = n), vjust = -0.4, size = 3.5) +
  theme(legend.position = "none") +
  xlab("Bairro") +
  ylab("Frequência")

Tipo de quarto

# Bar chart of the number of listings per room type, with each bar's count
# printed just above it.
airbnb %>%
  ggplot(aes(x = room_type, fill = room_type)) +
  geom_bar() +
  geom_text(
    aes(label = ..count..),
    stat = "count",
    vjust = -0.4,
    size = 3.5
  )

Tipo de propriedade

# Number of most-frequent property types shown individually; all remaining
# types are pooled under the label "Outros".
n_tipos <- 6

# Count listings per property type (most frequent first), relabel every row
# past the top `n_tipos` as "Outros", then sum the counts per label.
# Ranking by row position also works when there are fewer than `n_tipos`
# property types.
tipos_propriedade <- airbnb %>%
  count(property_type, sort = TRUE) %>%
  mutate(
    tipo_propriedade = if_else(
      row_number() <= n_tipos,
      as.character(property_type),
      "Outros"
    ),
    # Levels follow first appearance: top types by count, then "Outros".
    tipo_propriedade = factor(tipo_propriedade, levels = unique(tipo_propriedade))
  ) %>%
  group_by(tipo_propriedade) %>%
  summarise(n = sum(n))

# Bar chart of the pooled counts. Axis text is blanked, so the fill legend is
# what identifies each bar and must stay visible. The former `legend = NULL`
# argument was not a geom parameter (ggplot2 ignored it with a warning) and
# has been removed.
tipos_propriedade %>%
  ggplot(aes(tipo_propriedade, n, fill = tipo_propriedade)) +
  geom_col() +
  geom_text(aes(label = n), vjust = -0.4, size = 3.5) +
  xlab("Tipo de propriedade") +
  ylab("Frequência") +
  theme(axis.text = element_blank())

Política de cancelamento

# Number of listings per cancellation policy, most common policy first.
politicas_cancelamento <- count(airbnb, cancellation_policy, sort = TRUE)
politicas_cancelamento
## # A tibble: 6 x 2
##   cancellation_policy             n
##   <fct>                       <int>
## 1 strict_14_with_grace_period  8722
## 2 flexible                     4938
## 3 moderate                     3971
## 4 super_strict_60               223
## 5 super_strict_30                90
## 6 strict                          2
# Bar chart of listings per cancellation policy, bars ordered by decreasing
# count. The x tick labels are hidden; the fill legend names each policy.
politicas_cancelamento %>%
  mutate(politica = reorder(cancellation_policy, -n)) %>%
  ggplot(aes(x = politica, y = n, fill = politica)) +
  geom_col() +
  geom_text(aes(label = n), vjust = -0.4, size = 3.5) +
  labs(
    x = "Política de cancelamento",
    y = "Frequência",
    fill = "Política de cancelamento"
  ) +
  theme(axis.text.x = element_blank())

Requer foto de perfil do hóspede

# Theme overrides shared by the pie charts: no axis titles, x-axis text,
# ticks, grid lines or panel border, plus a bold 14-pt plot title.
blank_theme <- theme(
  plot.title = element_text(size = 14, face = "bold"),
  axis.title.x = element_blank(),
  axis.title.y = element_blank(),
  axis.text.x = element_blank(),
  axis.ticks = element_blank(),
  panel.grid = element_blank(),
  panel.border = element_blank()
)

# Pie chart (a single stacked bar drawn in polar coordinates) of how many
# listings require a guest profile picture; the counts are written in white
# at the middle of each slice.
ggplot(airbnb, aes(x = "", fill = require_guest_profile_picture)) +
  geom_bar(width = 1) +
  geom_text(
    aes(label = ..count..),
    stat = "count",
    position = position_stack(vjust = 0.5),
    color = "white"
  ) +
  coord_polar(theta = "y", start = 0) +
  blank_theme +
  labs(fill = "") +
  ggtitle("Requer foto de perfil do hóspede")

Requer que o hóspede tenha telefone verificado

# Pie chart (a single stacked bar drawn in polar coordinates) of how many
# listings require a verified guest phone; the counts are written in white
# at the middle of each slice.
ggplot(airbnb, aes(x = "", fill = require_guest_phone_verification)) +
  geom_bar(width = 1) +
  geom_text(
    aes(label = ..count..),
    stat = "count",
    position = position_stack(vjust = 0.5),
    color = "white"
  ) +
  coord_polar(theta = "y", start = 0) +
  blank_theme +
  labs(fill = "") +
  ggtitle("Requer que o hóspede tenha telefone verificado")

Preço

# Histogram of the raw price (30 bins), bars stacked by room type.
# The title previously read "Distribução"; the spelling is corrected here.
ggplot(airbnb, aes(price, fill = room_type)) +
  geom_histogram(bins = 30) +
  ggtitle("Distribuição de preço",
          subtitle = "A distribuição é muito inclinada") +
  # NOTE(review): these empty element_text() overrides appear to leave the
  # default axis titles unchanged - confirm before removing them.
  theme(axis.title = element_text(), axis.title.x = element_text())

  #geom_vline(xintercept = round(mean(airbnb$price), 2), size = 2, linetype = 3)
# Same 30-bin price histogram stacked by room type, drawn on a log10 x axis.
airbnb %>%
  ggplot(aes(price, fill = room_type)) +
  geom_histogram(bins = 30) +
  scale_x_log10() +
  ggtitle(
    "Distribuição transformada do preço",
    subtitle = expression("Com uma transformação" ~'log'[10] ~ "do eixo x")
  )

  #annotate("text", x = 1800, y = 0.75,label = paste("Mean price = ", paste0(round(mean(airbnb$price), 2), "$")),
  #         color =  "#32CD32", size = 8)
# Density-scaled price histograms on a log10 x axis, one panel per room type.
# The legend is suppressed because the facet strips already name each type.
airbnb %>%
  ggplot(aes(price, fill = room_type)) +
  geom_histogram(
    aes(y = ..density..),
    bins = 30,
    show.legend = FALSE
  ) +
  scale_x_log10() +
  facet_wrap(~room_type)

# Box plots of price per room type on a log10 y axis. The dashed purple line
# marks the arithmetic mean price over all listings.
airbnb %>%
  ggplot(aes(x = room_type, y = price)) +
  geom_boxplot(aes(fill = room_type)) +
  geom_hline(
    yintercept = mean(airbnb$price),
    color = "purple",
    linetype = 2
  ) +
  scale_y_log10() +
  labs(
    x = "Tipo de quarto",
    y = "Preço",
    title = "Boxplots of price by room type",
    subtitle = "Entire homes and apartments have the highest avg price"
  ) +
  theme(legend.position = "none")

Correlação

library(corrplot)
## corrplot 0.84 loaded
# Natural log of the price, stored as a new column next to the raw price.
airbnb$log_price <- log(airbnb$price)

# Keep only the numeric columns. vapply() guarantees a logical selector,
# unlike sapply(), whose return type depends on its input.
airbnb_cor <- airbnb[, vapply(airbnb, is.numeric, logical(1))]
# Use only rows with no missing value in any numeric column.
airbnb_cor <- airbnb_cor[complete.cases(airbnb_cor), ]

# Spearman rank correlation between all numeric columns, drawn as a coloured
# grid. NOTE(review): Spearman is rank-based, so `price` and `log_price` will
# show identical correlations - consider keeping only one of them.
correlation_matrix <- cor(airbnb_cor, method = "spearman")
corrplot(correlation_matrix, method = "color")

Mapa com as listagens

# Colour mapping for the room types, built from five base colours.
pal <- colorFactor(
  palette = c("red", "green", "blue", "purple", "yellow"),
  domain = airbnb$room_type
)

# Interactive map: one small circle per listing on a dark, label-free
# basemap, coloured by room type, with the listing name as hover label and a
# room-type legend in the bottom-right corner.
leaflet(data = airbnb) %>%
  addProviderTiles(providers$CartoDB.DarkMatterNoLabels) %>%
  addCircleMarkers(
    lng = ~longitude,
    lat = ~latitude,
    radius = 1,
    weight = 1,
    color = ~pal(room_type),
    opacity = 1,
    fillOpacity = 0.1,
    label = ~paste("Name:", name)
  ) %>%
  addLegend(
    position = "bottomright",
    pal = pal,
    values = ~room_type,
    title = "Room types",
    opacity = 1
  )
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# 3D scatter of listings: longitude and latitude on the horizontal axes,
# price on the vertical axis, points coloured by room type. The trace type
# and mode are set explicitly so plotly does not have to guess them (and
# print "No trace type specified" / "No scatter3d mode specifed" messages).
plot_ly(
  airbnb,
  x = ~longitude,
  y = ~latitude,
  z = ~price,
  color = ~room_type,
  type = "scatter3d",
  mode = "markers"
)